import pandas as pd

"""
数据预处理
"""

def process_house_sales(path='../data/house_sales.csv'):
    """Load, clean and enrich the house sales data set.

    The CSV at *path* is read, passed through ``clean_house_sales``, its
    ``date`` column is converted to datetime, and ``fill_feature`` adds the
    derived feature columns.

    Args:
        path: Location of the CSV file. Defaults to the path that was
            previously hard-coded, so existing calls keep working.

    Returns:
        The cleaned ``DataFrame`` with the parsed ``date`` column and the
        derived feature columns.
    """
    # Load the raw data and clean it.
    house_sales = clean_house_sales(pd.read_csv(path))

    # Convert the date column. assign() builds a new frame rather than
    # writing into the filtered result of the cleaning step, which avoids
    # pandas' chained-assignment warning.
    house_sales = house_sales.assign(date=pd.to_datetime(house_sales['date']))

    fill_feature(house_sales)
    return house_sales

def clean_house_sales(house_sales):
    """Drop incomplete rows and price outliers from the sales data.

    Rows containing any missing value are removed first. Prices are then
    filtered with the interquartile-range rule: a row is kept only when its
    ``price`` lies within ``[Q1 - 1.5 * IQR, Q3 + 1.5 * IQR]``, where the
    quartiles are computed with midpoint interpolation.

    Args:
        house_sales: ``DataFrame`` with at least a ``price`` column.

    Returns:
        A new ``DataFrame`` holding the retained rows. The input frame is
        left unmodified.
    """
    # Drop every row that has at least one missing value (non-in-place, so
    # the caller's frame is not mutated).
    complete = house_sales.dropna(axis=0, how='any')

    # Quartiles of the price column for the IQR outlier rule.
    prices = complete['price']
    q1 = prices.quantile(0.25, interpolation='midpoint')
    q3 = prices.quantile(0.75, interpolation='midpoint')
    iqr = q3 - q1

    # The lower fence is Q1 *minus* 1.5 * IQR; using plus here would throw
    # away almost all of the regular observations.
    lower = q1 - 1.5 * iqr
    upper = q3 + 1.5 * iqr
    in_range = (prices >= lower) & (prices <= upper)

    # Return an independent copy so later column assignments by callers do
    # not operate on a slice of the intermediate frame.
    return complete[in_range].copy()


def fill_feature(house_sales):
    """Add derived feature columns to *house_sales* in place.

    Two columns are written:

    * ``is_renovated`` - ``0`` where ``yr_renovated`` equals ``0``,
      otherwise ``1``.
    * ``age_built`` - year of ``date`` minus ``yr_built``.

    Args:
        house_sales: ``DataFrame`` with ``yr_renovated``, ``yr_built`` and a
            datetime-typed ``date`` column (the ``.dt`` accessor is used).

    Returns:
        None. The frame is modified in place.
    """
    # Vectorized comparison instead of a per-row Python lambda; any value
    # other than 0 is flagged as renovated.
    house_sales['is_renovated'] = house_sales['yr_renovated'].ne(0).astype('int64')
    house_sales['age_built'] = house_sales['date'].dt.year - house_sales['yr_built']
